#-*-coding:utf-8-*-
#!/usr/bin/python

from lib import config, req_sy, db_sy, encoding
from lib import email_sy as email
from lib.logger import logger
from bs4 import BeautifulSoup
import time
import re,sys,os
import json
import gevent.monkey


gevent.monkey.patch_socket()
import gevent
from gevent.pool import Pool

# Shared greenlet pool bounding concurrent page fetches (size from config).
gpool = Pool(config.GPOOLSIZE)
# Global crawl-state map: site_id -> [catid, ?, last_update_timestamp].
# Loaded from data/zixun_sites_info.json in the __main__ block and written
# back at exit so each run only fetches articles newer than the last crawl.
sites_info = {}

def _pro_content_from_toutiao(url):
	"""Fetch one Toutiao article page and extract its fields.

	Returns a 4-tuple (title, content, pic, n_from); each element is ''
	when it cannot be extracted. Parse failures are logged and reported
	by email instead of propagating.
	"""
	title, content, pic, n_from = '', '', '', ''
	html = req_sy.get_html(url)
	if not html.strip():
		return title, content, pic, n_from
	try:
		soup = BeautifulSoup(html, 'html.parser', from_encoding = 'utf-8')
		# The page <title> carries a fixed-length site suffix; slice it off.
		title = soup.title.string[0:-20]
		n_from = soup.find(class_="tth-media-name").text
		article_nodes = soup.find_all(class_="article-content")
		if article_nodes:
			# Strip the wrapping tag markup around the article body.
			content = str(article_nodes[0])[30:-6]
			img_tag = soup.find('img', src=re.compile(r"http://p\d\.pstatp\.com/large/.{19}"))
			pic = str(img_tag['src']) if img_tag else ''
		return title, content, pic, n_from
	except Exception as e:
		logger.error(e)
		email.send_email(u'头条资讯爬取报错', e)
	return title, content, pic, n_from

def _get_page_views_asynchronous(content_urls, catid):
	"""Crawl all article URLs concurrently and return the parsed articles.

	Spawns one greenlet per URL on the shared pool, waits for completion,
	and keeps only pages whose title was successfully extracted.

	Args:
		content_urls: iterable of article page URLs.
		catid: category id stamped onto every result dict.

	Returns:
		List of dicts with keys title/content/catid/pic/n_from, ready for
		db_sy.db_insert().
	"""
	threads = [gpool.spawn(_pro_content_from_toutiao, url) for url in content_urls]
	gpool.join()

	res = []
	for thread in threads:
		# Skip failed greenlets and pages where no title could be parsed.
		if thread.successful() and thread.value[0]:
			title, content, pic, n_from = thread.value
			res.append({
				'title': title,
				'content': content,
				'catid': catid,
				'pic': pic,
				'n_from': n_from,
			})
	return res

# 工具函数：将字符串2016-03-19 09:10转化为Unix时间戳
def _str_to_timestamp(timestr):
	timearr = time.strptime(timestr, '%Y-%m-%d %H:%M')
	return int(time.mktime(timearr))

def _urls_filter(site_id, content_urls):
	"""Keep only article URLs published after the site's last crawl time.

	Looks up the site's last-update timestamp in the global sites_info,
	drops items published at or before it, and advances the stored
	timestamp to the newest publication time seen.

	Args:
		site_id: key into the global sites_info map.
		content_urls: list of {'href': ..., 'time': 'YYYY-MM-DD HH:MM'} dicts.

	Returns:
		List of hrefs that still need to be crawled (empty if the site is
		unknown).
	"""
	info = sites_info.get(site_id)
	if info is None:
		# Unknown site: nothing to filter against and nowhere to persist a
		# timestamp. (The original code raised NameError here because
		# update_time was never bound.)
		logger.error('no sites_info entry for site_id %s' % site_id)
		return []
	update_time = info[2]
	new_update_time = update_time
	res_arr = []
	for item in content_urls:
		time_temp = _str_to_timestamp(item['time'])
		if time_temp <= update_time:
			continue
		res_arr.append(item['href'])
		if time_temp > new_update_time:
			new_update_time = time_temp
	print(str(new_update_time))
	# Persist the newest timestamp back into the global map so later pages
	# of this site (and the JSON dump at exit) see it.
	info[2] = new_update_time
	print("after filter:" + str(len(res_arr)))
	return res_arr

def get_list_action(site_id, catid):
	"""Crawl list pages p20..p1 of one Toutiao site and store new articles.

	Walks the paginated article list from page 20 down to page 1, collects
	each article's source link and publish time, filters already-crawled
	entries via _urls_filter, fetches the remaining articles concurrently
	and inserts them into the 'toutiao' table.

	Args:
		site_id: Toutiao site identifier (path segment of the list URL).
		catid: category id stamped onto every stored article.
	"""
	url_pre = "http://toutiao.com/"
	for page in range(20, 0, -1):
		toutiao_url = url_pre + site_id + "/p%s/" % page
		html = req_sy.get_html(toutiao_url)
		if html.strip() == '':
			continue
		soup = BeautifulSoup(html, 'html.parser', from_encoding = 'utf-8')
		content_urls = []
		for item in soup.find_all(class_='pin'):
			# One entry per article: source link plus its publish time.
			info = {}
			info['href'] = item.find(ga_event="source_url")['href']
			info['time'] = item.find(align="right").string
			content_urls.append(info)
		# Only crawl articles newer than the site's last recorded update.
		content_urls = _urls_filter(site_id, content_urls)
		content_list = _get_page_views_asynchronous(content_urls, catid)

		# NOTE(review): a fresh connection is opened per page and never
		# explicitly closed — confirm db_sy manages/recycles connections.
		dbconn = db_sy.getConnection()
		db_sy.db_insert(dbconn, content_list, 'toutiao')

if __name__ == "__main__":
	# Load per-site crawl state: site_id -> [catid, ?, last_update_timestamp].
	with open('data/zixun_sites_info.json', 'r') as f:
		sites_info = json.loads(f.read())
	now = time.time()

	for (site_id, detail) in sites_info.items():
		time.sleep(1)  # be polite between sites
		catid = detail[0]
		print("**********next site:" + site_id + "*************")
		get_list_action(site_id, catid)
	# Persist the updated last-crawl timestamps for the next run.
	with open('data/zixun_sites_info.json', 'w') as f:
		f.write(json.dumps(sites_info))
	print("time cost : " + str(int((time.time() - now))) + " seconds")
